library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.4 v dplyr 1.0.7
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 2.0.1 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## The following object is masked from 'package:purrr':
##
## transpose
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(cvms)
## Warning: package 'cvms' was built under R version 4.1.2
library(imbalance)
## Warning: package 'imbalance' was built under R version 4.1.2
library(mice)
## Warning: package 'mice' was built under R version 4.1.2
##
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## cbind, rbind
library(ggplot2)
library(caret)
library(dplyr)
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(ROSE)
## Warning: package 'ROSE' was built under R version 4.1.2
## Loaded ROSE 0.0-4
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.1.2
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library(e1071)
library(readr)
# Read the stroke data set; the literal string "N/A" (used in the bmi
# column) is converted to NA at read time.
data <- read.csv('healthcare-dataset-stroke-data.csv', na.strings = c('N/A'))
# Convert to data.table for faster subsetting and compact printing
data <- as.data.table(data)
# Preview the first 10 rows
head(data,10)
## id gender age hypertension heart_disease ever_married work_type
## 1: 9046 Male 67 0 1 Yes Private
## 2: 51676 Female 61 0 0 Yes Self-employed
## 3: 31112 Male 80 0 1 Yes Private
## 4: 60182 Female 49 0 0 Yes Private
## 5: 1665 Female 79 1 0 Yes Self-employed
## 6: 56669 Male 81 0 0 Yes Private
## 7: 53882 Male 74 1 1 Yes Private
## 8: 10434 Female 69 0 0 No Private
## 9: 27419 Female 59 0 0 Yes Private
## 10: 60491 Female 78 0 0 Yes Private
## Residence_type avg_glucose_level bmi smoking_status stroke
## 1: Urban 228.69 36.6 formerly smoked 1
## 2: Rural 202.21 NA never smoked 1
## 3: Rural 105.92 32.5 never smoked 1
## 4: Urban 171.23 34.4 smokes 1
## 5: Rural 174.12 24.0 never smoked 1
## 6: Urban 186.21 29.0 formerly smoked 1
## 7: Rural 70.09 27.4 never smoked 1
## 8: Urban 94.39 22.8 never smoked 1
## 9: Rural 76.15 NA Unknown 1
## 10: Urban 58.57 24.2 Unknown 1
Criteria definition:
*Note: “Unknown” in smoking_status means that the information is unavailable for this patient.
summary(data)
## id gender age hypertension
## Min. : 67 Length:5110 Min. : 0.08 Min. :0.00000
## 1st Qu.:17741 Class :character 1st Qu.:25.00 1st Qu.:0.00000
## Median :36932 Mode :character Median :45.00 Median :0.00000
## Mean :36518 Mean :43.23 Mean :0.09746
## 3rd Qu.:54682 3rd Qu.:61.00 3rd Qu.:0.00000
## Max. :72940 Max. :82.00 Max. :1.00000
##
## heart_disease ever_married work_type Residence_type
## Min. :0.00000 Length:5110 Length:5110 Length:5110
## 1st Qu.:0.00000 Class :character Class :character Class :character
## Median :0.00000 Mode :character Mode :character Mode :character
## Mean :0.05401
## 3rd Qu.:0.00000
## Max. :1.00000
##
## avg_glucose_level bmi smoking_status stroke
## Min. : 55.12 Min. :10.30 Length:5110 Min. :0.00000
## 1st Qu.: 77.25 1st Qu.:23.50 Class :character 1st Qu.:0.00000
## Median : 91.89 Median :28.10 Mode :character Median :0.00000
## Mean :106.15 Mean :28.89 Mean :0.04873
## 3rd Qu.:114.09 3rd Qu.:33.10 3rd Qu.:0.00000
## Max. :271.74 Max. :97.60 Max. :1.00000
## NA's :201
str(data)
## Classes 'data.table' and 'data.frame': 5110 obs. of 12 variables:
## $ id : int 9046 51676 31112 60182 1665 56669 53882 10434 27419 60491 ...
## $ gender : chr "Male" "Female" "Male" "Female" ...
## $ age : num 67 61 80 49 79 81 74 69 59 78 ...
## $ hypertension : int 0 0 0 0 1 0 1 0 0 0 ...
## $ heart_disease : int 1 0 1 0 0 0 1 0 0 0 ...
## $ ever_married : chr "Yes" "Yes" "Yes" "Yes" ...
## $ work_type : chr "Private" "Self-employed" "Private" "Private" ...
## $ Residence_type : chr "Urban" "Rural" "Rural" "Urban" ...
## $ avg_glucose_level: num 229 202 106 171 174 ...
## $ bmi : num 36.6 NA 32.5 34.4 24 29 27.4 22.8 NA 24.2 ...
## $ smoking_status : chr "formerly smoked" "never smoked" "never smoked" "smokes" ...
## $ stroke : int 1 1 1 1 1 1 1 1 1 1 ...
## - attr(*, ".internal.selfref")=<externalptr>
From the summary, We can make few basic observations:
# Bar charts: stroke outcome broken down by gender (Female / Male / Other).
colors <- c("tomato", "royalblue", "olivedrab1")
tbl <- table(data$gender, data$stroke)
barplot(tbl,
        beside = TRUE, legend = TRUE, col = colors,
        names.arg = c("No Stroke", "Stroke"),
        main = "Stroke events by gender")
# Second column of the table = confirmed strokes only (stroke == 1)
barplot(tbl[, 2], legend = TRUE, col = colors,
        main = "Confirmed stroke by gender")
Based on the data set, there are more female patients with stroke than males.
library(ggplot2)
library(dplyr)
# Split the cohort by outcome for the plots that follow
withstroke <- filter(data, stroke == 1)
wostroke <- filter(data, stroke == 0)
# Human-readable outcome label, reused by the contingency tables below
stat <- ifelse(data$stroke == 1, "stroke", "no stroke")
# Total number of confirmed strokes
sum(data$stroke)
## [1] 249
table(data$gender, stat)
## stat
## no stroke stroke
## Female 2853 141
## Male 2007 108
## Other 1 0
Of all Participants, 249 are diagnosed with stroke or 4.9% of total participants.
Between genders, 141 females and 108 males had a stroke — roughly 4.7% and 5.1% of each group, respectively.
Conclusion: by gender, stroke is slightly more frequent in men (about 5.1%) than in women (about 4.7%). However, the difference is too small to conclude that gender alone is a major factor in developing a stroke.
We will examine further which criteria, whether in males or females, could potentially trigger the development of a stroke.
withstroke %>% ggplot(aes(age, fill=gender)) + geom_density(alpha=0.2) + ggtitle("Stroke by Age in Male and Female")
In Conclusion, looking at the chart, Age plays a vital factor in the incidence of stroke. As age of the participants progresses so is the prevalence of stroke. Women tend to be prone at an earlier age while more men develop the illness over time.
# Recompute the readable outcome label and cross-tabulate it against
# hypertension status.
stat <- ifelse(data$stroke==1,"stroke","no stroke")
highblood <- ifelse(data$hypertension==1,"with hypertension","without hypertension")
table(highblood, stat)
## stat
## highblood no stroke stroke
## with hypertension 432 66
## without hypertension 4429 183
We have a total of 498 participants that are hypertensive or 9.8%.
And 66 of hypertensive participants developed stroke or 13.25%
On the other hand, we have 4612 participants without hypertension and 183 of them had a stroke or 3.97%
In conclusion, with hypertension, you get 3 times as much chance of developing stroke.
# Bar charts of stroke events by hypertension diagnosis.
# The rows of table(hypertension, stroke) are ordered 0 (no hypertension)
# then 1 (hypertension) -- the second barplot's names.arg below follows
# that order -- so the legend labels must follow it too. The original
# legend.text was reversed, mislabelling the two bars.
colors <- c("royalblue", "tomato")
tbl <- with(data, table(hypertension, stroke))
barplot(tbl, legend = TRUE, legend.text = c("No Hypertension", "Hypertension"),
        beside = TRUE, col = colors,
        names.arg = c("No Stroke", "Stroke"),
        main = "Stroke events by hypertension diagnosis")
# Confirmed strokes only, split by hypertension status
barplot(tbl[, 2], col = colors,
        main = "Confirmed stroke events by hypertension diagnosis",
        names.arg = c("Without Hypertension", "With Hypertension"))
It’s surprising that most confirmed stroke events are from patients without a hypertension diagnosis (although hypertensive patients have a higher stroke rate within their group).
The next analysis will be the comparison of stroke events and a heart disease background.
# Cross-tabulate heart-disease history against stroke outcome
stat <- ifelse(data$stroke==1,"stroke","no stroke")
heart.disease <- ifelse(data$heart_disease==1,"with heart disease","without heart disease")
table(heart.disease, stat)
## stat
## heart.disease no stroke stroke
## with heart disease 229 47
## without heart disease 4632 202
We have a total of 276 participants that have heart disease or 5.4% of overall participants,
47 of those or 17% have developed stroke.
While 4834 OF participants do not have heart disease yet 202 have developed stroke or 4.2%
In conclusion, with heart disease, you get 4 times as much chance of developing stroke.
# Grouped bar chart: stroke outcome by heart-disease history.
# Row order of the table is 0 (without) then 1 (with), matching the legend.
colors <- c("royalblue", "tomato")
tbl <- with(data, table(heart_disease, stroke))
barplot(tbl,
        beside = TRUE, col = colors,
        legend = TRUE, legend.text = c("Without heart disease", "With heart disease"),
        names.arg = c("No Stroke", "Stroke"),
        main = "Stroke events by heart disease background")
# Confirmed strokes only
barplot(tbl[, 2], col = colors,
        names.arg = c("Without heart disease", "With heart disease"),
        main = "Confirmed stroke events by heart disease background")
As shown in the second chart, most of patients with stroke don’t have heart diseases.
sum(data$hypertension==1 & data$heart_disease==1)
## [1] 64
Inference: Of all applicants, 64 have both hypertension and heart disease
sum(data$hypertension==1 & data$heart_disease==1 & data$stroke==1)
## [1] 13
And 13 of those developed stroke or 20.31%
Overall, the risk of stroke is 6 times as high if you have both hypertension and heart disease.
Keep Track of your heart health to prevent stroke.
withstroke %>% ggplot(aes(avg_glucose_level, fill=gender)) + geom_density(alpha=0.2) + ggtitle("Stroke and Glucose Level by Gender")
withstroke %>% ggplot(aes(age, avg_glucose_level, color=gender)) + geom_point() + ggtitle("Stroke and Glucose Level over Time")
More participants with lower glucose level developed stroke and mostly were women while the opposite spectrum of elevated glucose levels that developed the illness were men.
In conclusion, keeping one’s glucose level at normal level will decrease the chance of developing the illness over time.
# BMI distribution for the whole cohort
hist(data$bmi, col = "royalblue", main = "BMI distribution", xlab = 'BMI')
# BMI of stroke patients. The file was read with na.strings = c('N/A'),
# so bmi is already numeric and missing values are NA -- the original
# comparison bmi == "N/A" could never be TRUE and only propagated NA.
# Drop the missing values instead of silently recoding them to 0,
# which would bias the mean downward.
bmi_withstroke <- withstroke$bmi[!is.na(withstroke$bmi)]
hist(bmi_withstroke, main = "BMI of stroke patients", xlab = "BMI")
mean(bmi_withstroke)
## [1] 30.03124
From the 249 participants with stroke, the mean BMI is approximately 30.0 (the computed value above), showing that stroke can develop even at BMI values not far above the normal range.
ever.married <- data$ever_married
table(ever.married,stat)
## stat
## ever.married no stroke stroke
## No 1728 29
## Yes 3133 220
For the non-married only 29 participants developed stroke and 220 for the married.
The probability, for the non-married the development of the condition is 1.7% and for the married at 6.6%.
In summary, married individuals in this sample show roughly 4 times the stroke rate of never-married individuals. Note, however, that marital status is strongly confounded with age, which we already identified as a major factor.
We will compare the residence of the patients with the stroke events registered.
colors <- c("tomato", "royalblue")
tbl <- with(data, table(Residence_type, stroke))
barplot(tbl, legend = TRUE, beside = TRUE, col = colors,
names.arg = c("No Stroke", "Stroke"),
main = "Stroke events by patient's Residence type")
barplot(tbl[, 2], col = colors,
main = "Confirmed stroke events by patient's Residence type")
Our participants have about the same number in place of residence, either rural or urban.
reside <- data$Residence_type
table(reside, stat)
## stat
## reside no stroke stroke
## Rural 2400 114
## Urban 2461 135
Rural 4.53% Urban 5.20%
In conclusion, based on our sample, there is not much extreme difference in place of residence that would trigger a development of stroke. Whatever or wherever place you may be living, the potential exists almost as much.
We will check the influence of work type in stroke events.
colors <- c("tomato", "royalblue", "olivedrab1", "mediumpurple", "turquoise")
tbl <- with(data, table(work_type, stroke))
barplot(tbl, legend = TRUE, beside = TRUE, col = colors,
names.arg = c("No Stroke", "Stroke"), main = "Stroke events by patient's work type")
barplot(tbl[, 2], col = colors, main = "Confirmed stroke events by patient's work type")
Most patients in the data set have jobs on the private sector. That is reflected in both charts where private sector is the highest value for both, no stroke and confirmed stroke.
work <- data$work_type
table( work, stat)
## stat
## work no stroke stroke
## children 685 2
## Govt_job 624 33
## Never_worked 22 0
## Private 2776 149
## Self-employed 754 65
Statistically, the probability of developing a stroke by work:
Children 0.29%, Govt_job 5.02%, Never_worked 0%, Private 5.09%, Self-employed 7.94%
Based on participants’ data, those that never worked had zero observed probability of developing stroke, whereas children had a small one. The Government and Private work sectors have about the same probability, and the Self-employed have the highest.
The relation between smoking habits and stroke events.
colors <- c("tomato", "royalblue", "olivedrab1", "mediumpurple")
tbl <- with(data, table(smoking_status, stroke))
barplot(tbl, legend = TRUE, beside = TRUE, col = colors,
names.arg = c("No Stroke", "Stroke"), main = "Stroke events by smoking habits")
barplot(tbl[, 2], col = colors,
main = "Confirmed stroke events by smoking habits")
Surprisingly, patients that never smoked or that smoked in the past have more stroke events than active smokers. Although, we have to keep in mind that a notable portion of the data doesn’t have a clear register of the smoking habits of the patient, represented by the unknown category.
The three statuses should be enough to determine individual probabilities.
smoke <- data$smoking_status
table( smoke, stat)
## stat
## smoke no stroke stroke
## formerly smoked 815 70
## never smoked 1802 90
## smokes 747 42
## Unknown 1497 47
Statistically, the probability of developing a stroke by Smoking:
formerly smoked 7.91%, never smoked 4.76%, smokes 5.32%, Unknown 3.04%
Smoking, even if you have stopped already, increases the risk of stroke as the data tells us.
Before training models, the data must be prepared. We decided to use the one-hot encoding technique, converting the categorical variables into multiple ones, each one with a value of 0 or 1.
Also, the age, average glucose level and bmi variables will be standardized.
# Standardize the continuous predictors (z-score: subtract mean, divide by sd).
# bmi contains 201 missing values, so mean()/sd() need na.rm = TRUE --
# without it both return NA and the entire bmi column becomes NA.
data$age <- (data$age - mean(data$age, na.rm = TRUE)) / sd(data$age, na.rm = TRUE)
data$bmi <- (data$bmi - mean(data$bmi, na.rm = TRUE)) / sd(data$bmi, na.rm = TRUE)
data$avg_glucose_level <- (data$avg_glucose_level - mean(data$avg_glucose_level, na.rm = TRUE)) /
  sd(data$avg_glucose_level, na.rm = TRUE)
# NOTE(review): bmi rows that were NA remain NA after standardization;
# the mice package loaded above could impute them before modelling.
# One-hot encode the categorical variables via caret::dummyVars
dummy <- dummyVars(" ~ . ", data = data)
data <- data.frame(predict(dummy, newdata = data))
Now, we will check for class imbalance.
table(data$stroke)
##
## 0 1
## 4861 249
We will use MWMOTE (Majority Weighted Minority Oversampling Technique) for oversampling the stroke class and reduce class imbalance.
# Oversample the minority (stroke) class with MWMOTE, generating 500
# synthetic instances. NOTE(review): bmi may still contain missing
# values at this point -- confirm mwmote() tolerates NA, or impute first.
oversampled <- mwmote(data, classAttr = "stroke", numInstances = 500)
# Round so the one-hot indicator columns stay 0/1. NOTE(review): this
# also rounds the standardized continuous columns -- confirm intended.
oversampled <- round(oversampled)
First, we need to create a training and testing data set. We will use an 80:20 approach, 80% of the data to the training set and 20% for the final testing.
We also will use the K-folds cross validation method with K = 5 on the training set.
# Reproducible 80:20 stratified train/test split of the balanced data.
set.seed(1203)
# Combine the original rows with the synthetic minority rows
fullData <- rbind(data, oversampled)
# Target class needs to be a factor
fullData$stroke <- factor(fullData$stroke)
# NOTE(review): the name `sample` masks base::sample()
sample <- createDataPartition(y = fullData$stroke, p = 0.8, list = FALSE)
train <- fullData[sample, ]
test <- fullData[-sample, ]
# 5-fold cross-validation on the training set
train_control <- trainControl(method = "cv", number = 5)
The models to evaluate are Random Forest, K-Nearest Neighbor and Logistic Regression
# Random forest tuned over mtry via 5-fold CV (accuracy criterion).
# NOTE(review): this assignment masks randomForest::randomForest();
# a name like rf_model would be safer.
randomForest <- train(stroke ~ ., data = train, method = "rf", trControl = train_control)
randomForest
## Random Forest
##
## 4489 samples
## 22 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 3591, 3591, 3591, 3592, 3591
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.9120056 0.4729784
## 12 0.9563378 0.7844875
## 22 0.9529967 0.7705975
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 12.
# K-nearest neighbours tuned over k via 5-fold CV
knn <- train(stroke~., data = train, method = "knn", trControl = train_control)
knn
## k-Nearest Neighbors
##
## 4489 samples
## 22 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 3591, 3591, 3592, 3591, 3591
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 5 0.9169081 0.6296347
## 7 0.9164617 0.6350859
## 9 0.9097794 0.6050036
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 5.
# Logistic regression (binomial GLM) with the same 5-fold CV control
logisticRegression <- train(stroke~., data = train, method = "glm",
trControl = train_control,
family = "binomial")
logisticRegression
## Generalized Linear Model
##
## 4489 samples
## 22 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 3591, 3591, 3591, 3592, 3591
## Resampling results:
##
## Accuracy Kappa
## 0.875474 0.2679571
# Score the held-out test set with the random-forest model and plot the
# resulting confusion matrix (cvms::evaluate needs character predictions).
test$prediction <- as.character(predict(randomForest, newdata = test))
conf_matrix <- evaluate(test,
                        target_col = "stroke",
                        prediction_cols = "prediction",
                        type = "binomial", positive = "1")
plot_confusion_matrix(conf_matrix)
library(tidyverse)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(ggsci)
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.1.2
## corrplot 0.92 loaded
library(ggplot2)
library(lattice)
library(randomForest)
library(remote)
## Warning: package 'remote' was built under R version 4.1.2
## Loading required package: Rcpp
## Loading required package: raster
## Loading required package: sp
##
## Attaching package: 'raster'
## The following object is masked from 'package:plotly':
##
## select
## The following object is masked from 'package:e1071':
##
## interpolate
## The following object is masked from 'package:data.table':
##
## shift
## The following object is masked from 'package:dplyr':
##
## select
## The following object is masked from 'package:tidyr':
##
## extract
library(data.table)
library(pROC)
library(cvms)
library(imbalance)
library(mice)
library(GGally)
library(ROSE)
library(randomForest)
library(e1071)
diab<-read_csv("diabetes.csv")
## Rows: 768 Columns: 9
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## dbl (9): Pregnancies, Glucose, BloodPressure, SkinThickness, Insulin, BMI, D...
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(diab,10)
## # A tibble: 10 x 9
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 6 148 72 35 0 33.6
## 2 1 85 66 29 0 26.6
## 3 8 183 64 0 0 23.3
## 4 1 89 66 23 94 28.1
## 5 0 137 40 35 168 43.1
## 6 5 116 74 0 0 25.6
## 7 3 78 50 32 88 31
## 8 10 115 0 0 0 35.3
## 9 2 197 70 45 543 30.5
## 10 8 125 96 0 0 0
## # ... with 3 more variables: DiabetesPedigreeFunction <dbl>, Age <dbl>,
## # Outcome <dbl>
head(diab)
## # A tibble: 6 x 9
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigre~
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 6 148 72 35 0 33.6 0.627
## 2 1 85 66 29 0 26.6 0.351
## 3 8 183 64 0 0 23.3 0.672
## 4 1 89 66 23 94 28.1 0.167
## 5 0 137 40 35 168 43.1 2.29
## 6 5 116 74 0 0 25.6 0.201
## # ... with 2 more variables: Age <dbl>, Outcome <dbl>
tail(diab)
## # A tibble: 6 x 9
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigre~
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 9 89 62 0 0 22.5 0.142
## 2 10 101 76 48 180 32.9 0.171
## 3 2 122 70 27 0 36.8 0.34
## 4 5 121 72 23 112 26.2 0.245
## 5 1 126 60 0 0 30.1 0.349
## 6 1 93 70 31 0 30.4 0.315
## # ... with 2 more variables: Age <dbl>, Outcome <dbl>
colnames(diab)
## [1] "Pregnancies" "Glucose"
## [3] "BloodPressure" "SkinThickness"
## [5] "Insulin" "BMI"
## [7] "DiabetesPedigreeFunction" "Age"
## [9] "Outcome"
str(diab)
## spec_tbl_df [768 x 9] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Pregnancies : num [1:768] 6 1 8 1 0 5 3 10 2 8 ...
## $ Glucose : num [1:768] 148 85 183 89 137 116 78 115 197 125 ...
## $ BloodPressure : num [1:768] 72 66 64 66 40 74 50 0 70 96 ...
## $ SkinThickness : num [1:768] 35 29 0 23 35 0 32 0 45 0 ...
## $ Insulin : num [1:768] 0 0 0 94 168 0 88 0 543 0 ...
## $ BMI : num [1:768] 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
## $ DiabetesPedigreeFunction: num [1:768] 0.627 0.351 0.672 0.167 2.288 ...
## $ Age : num [1:768] 50 31 32 21 33 30 26 29 53 54 ...
## $ Outcome : num [1:768] 1 0 1 0 1 0 1 0 1 1 ...
## - attr(*, "spec")=
## .. cols(
## .. Pregnancies = col_double(),
## .. Glucose = col_double(),
## .. BloodPressure = col_double(),
## .. SkinThickness = col_double(),
## .. Insulin = col_double(),
## .. BMI = col_double(),
## .. DiabetesPedigreeFunction = col_double(),
## .. Age = col_double(),
## .. Outcome = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
summary(diab)
## Pregnancies Glucose BloodPressure SkinThickness
## Min. : 0.000 Min. : 0.0 Min. : 0.00 Min. : 0.00
## 1st Qu.: 1.000 1st Qu.: 99.0 1st Qu.: 62.00 1st Qu.: 0.00
## Median : 3.000 Median :117.0 Median : 72.00 Median :23.00
## Mean : 3.845 Mean :120.9 Mean : 69.11 Mean :20.54
## 3rd Qu.: 6.000 3rd Qu.:140.2 3rd Qu.: 80.00 3rd Qu.:32.00
## Max. :17.000 Max. :199.0 Max. :122.00 Max. :99.00
## Insulin BMI DiabetesPedigreeFunction Age
## Min. : 0.0 Min. : 0.00 Min. :0.0780 Min. :21.00
## 1st Qu.: 0.0 1st Qu.:27.30 1st Qu.:0.2437 1st Qu.:24.00
## Median : 30.5 Median :32.00 Median :0.3725 Median :29.00
## Mean : 79.8 Mean :31.99 Mean :0.4719 Mean :33.24
## 3rd Qu.:127.2 3rd Qu.:36.60 3rd Qu.:0.6262 3rd Qu.:41.00
## Max. :846.0 Max. :67.10 Max. :2.4200 Max. :81.00
## Outcome
## Min. :0.000
## 1st Qu.:0.000
## Median :0.000
## Mean :0.349
## 3rd Qu.:1.000
## Max. :1.000
diab$Outcome<-factor(diab$Outcome)
class(diab$Outcome)
## [1] "factor"
sapply(diab, function(x) sum(is.na(x)))
## Pregnancies Glucose BloodPressure
## 0 0 0
## SkinThickness Insulin BMI
## 0 0 0
## DiabetesPedigreeFunction Age Outcome
## 0 0 0
qplot(diab$Age, diab$BMI, xlab = "Patient Age", ylab ="Patient BMI")
ggplot(diab,aes(x=Glucose)) +
geom_histogram(binwidth=30,color='black',fill='chartreuse3')+
xlab('Glucose Level')+ylab('No. of people')
p1<-ggplot(diab,aes(x=Age,y=Pregnancies,col=Outcome))+geom_point()+geom_smooth(method="loess", se=T)+facet_grid(.~Outcome) + ggtitle("ScatterPlot for Pregnancies vs Age")
ggplotly(p1)
## `geom_smooth()` using formula 'y ~ x'
pairs(diab, panel = panel.smooth)
corrplot(cor(diab[, -9]),method = "number",title="Correlation Plot between the variables")
ggplot(diab, aes(Glucose))+
geom_density(aes(fill=factor(Outcome)), alpha=0.8) +
labs(title="Density plot",
subtitle="Glucose based on Diabetic Outcome",
x="Glucose",
fill="Diabetes")
ggplot(diab, aes(Insulin))+
geom_density(aes(fill=factor(Outcome)), alpha=0.8) +
labs(title="Density plot",
subtitle="Insulin level by Diabetic Outcome",
x="Insulin",
fill="Diabetes")
p3<-ggplot(diab,aes(x=Pregnancies))+geom_density(aes(fill=Outcome),alpha=0.6)+ ggtitle("Density Plot")+
geom_vline(aes(xintercept=mean(Pregnancies)),
color="blue", linetype="dashed", size=1)+facet_grid(.~Outcome)+scale_fill_aaas()
ggplotly(p3)
p3<-ggplot(diab,aes(x=Age, y=Pregnancies, size=Glucose, fill=BloodPressure))+geom_point(alpha=0.2)+
facet_grid(.~Outcome)+geom_jitter(width = 0.4)+scale_x_continuous(limits = c(18, 80))+scale_fill_material("red")
ggplotly(p3)
## Warning in L$marker$color[idx] <- aes2plotly(data, params, "fill")[idx]: number
## of items to replace is not a multiple of replacement length
## Warning in L$marker$color[idx] <- aes2plotly(data, params, "fill")[idx]: number
## of items to replace is not a multiple of replacement length
## Warning in L$marker$color[idx] <- aes2plotly(data, params, "fill")[idx]: number
## of items to replace is not a multiple of replacement length
## Warning in L$marker$color[idx] <- aes2plotly(data, params, "fill")[idx]: number
## of items to replace is not a multiple of replacement length
library(caret) # ML package for various methods
# Create the training and test datasets
set.seed(100)
hci<-diab
# Step 1: Get row numbers for the training data (stratified on Outcome)
trainRowNumbers <- createDataPartition(hci$Outcome, p=0.8, list=FALSE) # Data partition for dividing the dataset into training and testing data set. This is useful for cross validation
# Step 2: Create the training dataset (80%)
trainData <- hci[trainRowNumbers,]
# Step 3: Create the test dataset (20%)
testData <- hci[-trainRowNumbers,]
# Store X (predictor columns 1:8) and Y (Outcome) for later use;
# the Outcome columns are re-attached after preprocessing below.
x = trainData[, 1:8]
y=trainData$Outcome
xt= testData[, 1:8]
yt=testData$Outcome
# # See the structure of the new dataset
# Fit the range (min-max) scaler on the TRAINING data only, then apply
# that same transformation to both sets. The original code fitted a
# second scaler on the test data, which leaks test-set information and
# puts the two sets on incomparable scales.
preProcess_range_modeltr <- preProcess(trainData, method='range')
trainData <- predict(preProcess_range_modeltr, newdata = trainData)
testData <- predict(preProcess_range_modeltr, newdata = testData)
# Re-attach the factor outcome saved earlier
trainData$Outcome <- y
testData$Outcome <- yt
# caret requires syntactically valid R names for class levels
levels(trainData$Outcome) <- c("Class0", "Class1")
levels(testData$Outcome) <- c("Class0", "Class1")
fitControl <- trainControl(
method = 'cv', # k-fold cross validation
number = 5, # number of folds
savePredictions = 'final', # saves predictions for optimal tuning parameter
classProbs = T, # should class probabilities be returned
summaryFunction=twoClassSummary # results summary function
)
# Preparing the DataSet: random (unstratified) 70:30 train/test split
set.seed(123)
n <- nrow(diab)
train <- sample(n, trunc(0.70*n))
pima_training <- diab[train, ]
pima_testing <- diab[-train, ]
# Training The Model: full logistic regression on all 8 predictors
glm_fm1 <- glm(Outcome ~., data = pima_training, family = binomial)
summary(glm_fm1)
##
## Call:
## glm(formula = Outcome ~ ., family = binomial, data = pima_training)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2424 -0.7256 -0.4283 0.7341 2.9311
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.405409 0.841872 -9.984 < 2e-16 ***
## Pregnancies 0.103471 0.037973 2.725 0.00643 **
## Glucose 0.035730 0.004563 7.830 4.89e-15 ***
## BloodPressure -0.012707 0.006057 -2.098 0.03590 *
## SkinThickness 0.003563 0.008088 0.440 0.65959
## Insulin -0.001710 0.001060 -1.613 0.10671
## BMI 0.088735 0.017954 4.942 7.72e-07 ***
## DiabetesPedigreeFunction 0.696250 0.334761 2.080 0.03754 *
## Age 0.017015 0.011066 1.538 0.12415
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 694.17 on 536 degrees of freedom
## Residual deviance: 509.76 on 528 degrees of freedom
## AIC: 527.76
##
## Number of Fisher Scoring iterations: 5
# Drop the predictors that were not significant in glm_fm1.
# BUG FIX: the original call removed Triceps_Skin and Serum_Insulin,
# which are not columns of this data set (they are names from another
# version of the Pima data), so only Age was actually dropped -- the
# printed formula below confirms SkinThickness and Insulin remained.
# Use the actual column names.
glm_fm2 <- update(glm_fm1, ~ . - SkinThickness - Insulin - Age)
summary(glm_fm2)
##
## Call:
## glm(formula = Outcome ~ Pregnancies + Glucose + BloodPressure +
## SkinThickness + Insulin + BMI + DiabetesPedigreeFunction,
## family = binomial, data = pima_training)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3085 -0.7375 -0.4437 0.7205 2.9988
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.115329 0.811450 -10.001 < 2e-16 ***
## Pregnancies 0.133160 0.032937 4.043 5.28e-05 ***
## Glucose 0.037054 0.004513 8.211 < 2e-16 ***
## BloodPressure -0.011289 0.005979 -1.888 0.0590 .
## SkinThickness 0.002457 0.007997 0.307 0.7587
## Insulin -0.001749 0.001058 -1.653 0.0984 .
## BMI 0.086230 0.017822 4.838 1.31e-06 ***
## DiabetesPedigreeFunction 0.732824 0.333681 2.196 0.0281 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 694.17 on 536 degrees of freedom
## Residual deviance: 512.11 on 529 degrees of freedom
## AIC: 528.11
##
## Number of Fisher Scoring iterations: 5
# Testing the Model: predicted probabilities on the held-out set,
# thresholded at 0.5 to produce class labels.
glm_probs <- predict(glm_fm2, newdata = pima_testing, type = "response")
glm_pred <- ifelse(glm_probs > 0.5, "1", "0")
# NOTE(review): if one class were absent from the predictions, the
# factor levels here could mismatch the reference -- verify on new data.
pred_glm = as.factor(glm_pred)
confusionMatrix(pred_glm,pima_testing$Outcome)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 136 36
## 1 14 45
##
## Accuracy : 0.7835
## 95% CI : (0.7248, 0.8349)
## No Information Rate : 0.6494
## P-Value [Acc > NIR] : 6.46e-06
##
## Kappa : 0.493
##
## Mcnemar's Test P-Value : 0.002979
##
## Sensitivity : 0.9067
## Specificity : 0.5556
## Pos Pred Value : 0.7907
## Neg Pred Value : 0.7627
## Prevalence : 0.6494
## Detection Rate : 0.5887
## Detection Prevalence : 0.7446
## Balanced Accuracy : 0.7311
##
## 'Positive' Class : 0
##
acc_glm_fit <- confusionMatrix(pred_glm,pima_testing$Outcome)$overall['Accuracy']
# Preparing the DataSet:
diab$Outcome <- as.factor(diab$Outcome)
library(caret)
library(tree)
## Registered S3 method overwritten by 'tree':
## method from
## print.tree cli
library(e1071)
# Reproducible 70:30 stratified split on Outcome
set.seed(1000)
intrain <- createDataPartition(y = diab$Outcome, p = 0.7, list = FALSE)
train <- diab[intrain, ]
test <- diab[-intrain, ]
# Training The Model: unpruned classification tree
treemod <- tree(Outcome ~ ., data = train)
summary(treemod)
##
## Classification tree:
## tree(formula = Outcome ~ ., data = train)
## Variables actually used in tree construction:
## [1] "Glucose" "BMI"
## [3] "Pregnancies" "DiabetesPedigreeFunction"
## [5] "Insulin"
## Number of terminal nodes: 16
## Residual mean deviance: 0.7515 = 392.3 / 522
## Misclassification error rate: 0.1747 = 94 / 538
treemod # get a detailed text output.
## node), split, n, deviance, yval, (yprob)
## * denotes terminal node
##
## 1) root 538 696.300 0 ( 0.65056 0.34944 )
## 2) Glucose < 127.5 337 324.700 0 ( 0.81306 0.18694 )
## 4) BMI < 26.45 95 26.640 0 ( 0.96842 0.03158 ) *
## 5) BMI > 26.45 242 271.100 0 ( 0.75207 0.24793 )
## 10) Pregnancies < 4.5 153 129.500 0 ( 0.84967 0.15033 )
## 20) BMI < 45.4 146 109.000 0 ( 0.87671 0.12329 )
## 40) DiabetesPedigreeFunction < 0.5005 92 38.850 0 ( 0.94565 0.05435 ) *
## 41) DiabetesPedigreeFunction > 0.5005 54 59.610 0 ( 0.75926 0.24074 )
## 82) BMI < 32.2 24 8.314 0 ( 0.95833 0.04167 ) *
## 83) BMI > 32.2 30 40.380 0 ( 0.60000 0.40000 ) *
## 21) BMI > 45.4 7 8.376 1 ( 0.28571 0.71429 ) *
## 11) Pregnancies > 4.5 89 120.800 0 ( 0.58427 0.41573 )
## 22) Glucose < 103.5 42 46.110 0 ( 0.76190 0.23810 )
## 44) DiabetesPedigreeFunction < 0.291 13 0.000 0 ( 1.00000 0.00000 ) *
## 45) DiabetesPedigreeFunction > 0.291 29 37.360 0 ( 0.65517 0.34483 )
## 90) BMI < 38.65 23 24.080 0 ( 0.78261 0.21739 ) *
## 91) BMI > 38.65 6 5.407 1 ( 0.16667 0.83333 ) *
## 23) Glucose > 103.5 47 64.110 1 ( 0.42553 0.57447 ) *
## 3) Glucose > 127.5 201 266.600 1 ( 0.37811 0.62189 )
## 6) BMI < 29.95 51 55.650 0 ( 0.76471 0.23529 ) *
## 7) BMI > 29.95 150 167.600 1 ( 0.24667 0.75333 )
## 14) Glucose < 165.5 96 122.200 1 ( 0.33333 0.66667 )
## 28) DiabetesPedigreeFunction < 0.433 44 60.910 1 ( 0.47727 0.52273 ) *
## 29) DiabetesPedigreeFunction > 0.433 52 53.660 1 ( 0.21154 0.78846 )
## 58) Insulin < 297.5 46 35.620 1 ( 0.13043 0.86957 ) *
## 59) Insulin > 297.5 6 5.407 0 ( 0.83333 0.16667 ) *
## 15) Glucose > 165.5 54 33.320 1 ( 0.09259 0.90741 )
## 30) DiabetesPedigreeFunction < 1.1565 47 16.540 1 ( 0.04255 0.95745 )
## 60) DiabetesPedigreeFunction < 0.206 8 8.997 1 ( 0.25000 0.75000 ) *
## 61) DiabetesPedigreeFunction > 0.206 39 0.000 1 ( 0.00000 1.00000 ) *
## 31) DiabetesPedigreeFunction > 1.1565 7 9.561 1 ( 0.42857 0.57143 ) *
plot(treemod)
text(treemod, pretty = 0)
# Testing the Model
tree_pred <- predict(treemod, newdata = test, type = "class" )
tree_pred = as.factor(tree_pred)
confusionMatrix(tree_pred, test$Outcome)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 119 36
## 1 31 44
##
## Accuracy : 0.7087
## 95% CI : (0.6454, 0.7666)
## No Information Rate : 0.6522
## P-Value [Acc > NIR] : 0.04036
##
## Kappa : 0.3484
##
## Mcnemar's Test P-Value : 0.62507
##
## Sensitivity : 0.7933
## Specificity : 0.5500
## Pos Pred Value : 0.7677
## Neg Pred Value : 0.5867
## Prevalence : 0.6522
## Detection Rate : 0.5174
## Detection Prevalence : 0.6739
## Balanced Accuracy : 0.6717
##
## 'Positive' Class : 0
##
acc_treemod <- confusionMatrix(tree_pred, test$Outcome)$overall['Accuracy']
data <- as.data.table(diab)
head(data,10)
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## 1: 6 148 72 35 0 33.6
## 2: 1 85 66 29 0 26.6
## 3: 8 183 64 0 0 23.3
## 4: 1 89 66 23 94 28.1
## 5: 0 137 40 35 168 43.1
## 6: 5 116 74 0 0 25.6
## 7: 3 78 50 32 88 31.0
## 8: 10 115 0 0 0 35.3
## 9: 2 197 70 45 543 30.5
## 10: 8 125 96 0 0 0.0
## DiabetesPedigreeFunction Age Outcome
## 1: 0.627 50 1
## 2: 0.351 31 0
## 3: 0.672 32 1
## 4: 0.167 21 0
## 5: 2.288 33 1
## 6: 0.201 30 0
## 7: 0.248 26 1
## 8: 0.134 29 0
## 9: 0.158 53 1
## 10: 0.232 54 1
# MWMOTE oversampling of the diabetic class (500 synthetic rows).
# NOTE(review): unlike the stroke pipeline, the synthetic rows are not
# rounded here (line left commented out) -- confirm this asymmetry is
# intentional.
oversampled <- mwmote(data, classAttr = "Outcome", numInstances = 500)
#oversampled <- round(oversampled)
# Reproducible 80:20 stratified split of the balanced data
set.seed(1203)
fullData <- rbind(data, oversampled)
# Target class needs to be a factor
fullData$Outcome <- factor(fullData$Outcome)
sample <- createDataPartition(y = fullData$Outcome, p = 0.8, list = FALSE)
train <- fullData[sample, ]
test <- fullData[-sample, ]
train_control <- trainControl(method = "cv", number = 5)
# Random forest, tuned over mtry via 5-fold CV.
# NOTE(review): the name masks randomForest::randomForest().
randomForest <- train(Outcome ~ ., data = train, method = "rf", trControl = train_control)
randomForest
## Random Forest
##
## 1015 samples
## 8 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 812, 812, 812, 812, 812
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.8463054 0.6698108
## 5 0.8453202 0.6696772
## 8 0.8394089 0.6563198
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
test$prediction <- predict(randomForest, newdata = test)
test$prediction <- as.character(test$prediction)
conf_matrix <- evaluate(test, target_col = "Outcome", prediction_cols = "prediction",
type = "binomial", positive = "1")
acc_rf_pred=conf_matrix$Accuracy
# Return to the original (non-oversampled) diabetes data for the SVM
diab$Outcome <- as.factor(diab$Outcome)
library(e1071)
# Preparing the DataSet: 70/30 stratified train/test split
set.seed(1000)
intrain <- createDataPartition(y = diab$Outcome, p = 0.7, list = FALSE)
train <- diab[intrain, ]
test <- diab[-intrain, ]
# Grid-search gamma and cost with tune.svm (10-fold CV by default)
tuned <- tune.svm(Outcome ~ ., data = train, gamma = 10^(-6:-1), cost = 10^(-1:1))
# Show the cross-validated error for every gamma/cost combination
summary(tuned)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## gamma cost
## 0.01 1
##
## - best performance: 0.2285814
##
## - Detailed performance results:
## gamma cost error dispersion
## 1 1e-06 0.1 0.3493711 0.04098157
## 2 1e-05 0.1 0.3493711 0.04098157
## 3 1e-04 0.1 0.3493711 0.04098157
## 4 1e-03 0.1 0.3493711 0.04098157
## 5 1e-02 0.1 0.3493711 0.04098157
## 6 1e-01 0.1 0.2619147 0.05748958
## 7 1e-06 1.0 0.3493711 0.04098157
## 8 1e-05 1.0 0.3493711 0.04098157
## 9 1e-04 1.0 0.3493711 0.04098157
## 10 1e-03 1.0 0.3493711 0.04280075
## 11 1e-02 1.0 0.2285814 0.04761498
## 12 1e-01 1.0 0.2490217 0.03788081
## 13 1e-06 10.0 0.3493711 0.04098157
## 14 1e-05 10.0 0.3493711 0.04098157
## 15 1e-04 10.0 0.3493711 0.04280075
## 16 1e-03 10.0 0.2341719 0.04782051
## 17 1e-02 10.0 0.2379106 0.04426620
## 18 1e-01 10.0 0.2564640 0.05952472
# Fit the final radial-kernel SVM.
# NOTE(review): the grid search above selected gamma = 0.01, cost = 1
# (best performance 0.2285814), but cost = 10 is hard-coded here --
# confirm this is intentional before trusting the model comparison.
svm_model <- svm(Outcome ~ ., data = train, kernel = "radial", gamma = 0.01, cost = 10)
summary(svm_model)
##
## Call:
## svm(formula = Outcome ~ ., data = train, kernel = "radial", gamma = 0.01,
## cost = 10)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 10
##
## Number of Support Vectors: 284
##
## ( 142 142 )
##
##
## Number of Classes: 2
##
## Levels:
## 0 1
svm_pred <- predict(svm_model, newdata = test)
# Compute the confusion matrix once and reuse it below, instead of
# calling confusionMatrix() a second time just to extract the accuracy
cm_svm <- confusionMatrix(svm_pred, test$Outcome)
cm_svm
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 136 39
## 1 14 41
##
## Accuracy : 0.7696
## 95% CI : (0.7097, 0.8224)
## No Information Rate : 0.6522
## P-Value [Acc > NIR] : 7.748e-05
##
## Kappa : 0.4521
##
## Mcnemar's Test P-Value : 0.0009784
##
## Sensitivity : 0.9067
## Specificity : 0.5125
## Pos Pred Value : 0.7771
## Neg Pred Value : 0.7455
## Prevalence : 0.6522
## Detection Rate : 0.5913
## Detection Prevalence : 0.7609
## Balanced Accuracy : 0.7096
##
## 'Positive' Class : 0
##
acc_svm_model <- cm_svm$overall['Accuracy']
# Collect each model's hold-out accuracy and compare them in a bar chart
model_names <- c(
  "Logistic Regression",
  "Decision Tree",
  "Random Forest",
  "Support Vector Machine (SVM)"
)
accuracy <- data.frame(
  Model = model_names,
  Accuracy = c(acc_glm_fit, acc_treemod, acc_rf_pred, acc_svm_model)
)
# geom_col() is the same as geom_bar(stat = "identity")
ggplot(accuracy, aes(x = Model, y = Accuracy)) +
  geom_col() +
  theme_bw() +
  ggtitle('Comparison of Model Accuracy')
# Plot the cvms evaluation of the random-forest predictions
plot_confusion_matrix(conf_matrix)
```